import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

# Spam classifier script: load spam.csv, clean the 'spam' label column and the
# 'text' column, then train and evaluate a TF-IDF + logistic-regression model.

# Load the data
data = pd.read_csv('spam.csv')

# Data diagnostics
print("原始数据形状:", data.shape)
print("缺失值统计:\n", data.isnull().sum())
print("\n原始数据前5行:\n", data.head())

# Clean the data
# 1. Inspect the raw values of the spam column
print("\nSpam列的唯一值:", data['spam'].unique())

# 2. Fill missing labels (assumes a missing label means "not spam").
# NOTE(review): this imputes the negative class for unlabeled rows, which can
# add label noise — confirm this assumption holds for the dataset.
data['spam'] = data['spam'].fillna(0)

# 3. Make the spam column numeric and keep only rows labelled 0 or 1.
#    Values that cannot be parsed become NaN and are dropped by the isin filter.
data['spam'] = pd.to_numeric(data['spam'], errors='coerce')
data = data[data['spam'].isin([0, 1])]

# 4. Drop rows whose text is missing
data = data.dropna(subset=['text'])

# Verify the cleaned data
print("\n清理后数据形状:", data.shape)
class_counts = data['spam'].value_counts()
print("清理后spam分布:\n", class_counts)

# Feature extraction and model training.
# Training needs both classes, and a stratified split needs at least two rows
# per class (one for the training set, one for the test set).
if len(class_counts) == 2 and class_counts.min() >= 2:
    # Cast text to str so numeric-looking cells do not break the vectorizer,
    # and labels to int so the classes are exactly 0 and 1.
    X = data['text'].astype(str)
    y = data['spam'].astype(int).values

    # Hold out 20% of the rows, but never fewer than 2 so that a stratified
    # split can place one row of each class in the test set on tiny datasets.
    n_test = max(2, int(np.ceil(0.2 * len(data))))

    # Split BEFORE vectorizing so the test documents cannot influence the
    # vocabulary or IDF weights (avoids train/test leakage).
    text_train, text_test, y_train, y_test = train_test_split(
        X, y, test_size=n_test, random_state=42, stratify=y
    )

    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)

    # Create and train the model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test)
    print("\n模型评估:")
    print("准确率:", accuracy_score(y_test, y_pred))
    # Fix the label order so the matrix is always 2x2 (rows/cols: 0 then 1).
    print("混淆矩阵:\n", confusion_matrix(y_test, y_pred, labels=[0, 1]))
else:
    # Too few rows, or fewer than two rows in one of the classes.
    print("\n错误：数据量不足，无法训练模型")